Calculate Highly Variable Features And Get mC Fraction AnnData

Purpose

The purpose of this step is to select highly variable features (HVF) and generate cell-by-feature methylation fraction matrix for clustering. The highly variable features are selected by comparing feature’s normalized dispersion among cells.

Input

  • Filtered cell metadata;

  • MCDS files;

  • Feature list from basic feature filtering

Output

  • cell-by-HVF methylation fraction matrix stored in AnnData format, e.g., mCH adata and mCG adata.

Import

import pathlib
import pandas as pd
import xarray as xr
import dask
import ALLCools
from ALLCools.mcds import MCDS
from ALLCools.clustering import feature_enrichment

Parameters

# If True, will load all data into memory.
# Computation will be much faster, but also very memory intensive, only use this for small number of cells (<10,000)
load = True

# change this to the path to your filtered metadata
metadata_path = 'CellMetadata.PassQC.csv.gz'

# change this to the paths to your MCDS files
mcds_path_list = list(pathlib.Path('../../../data/Brain/snmC-seq2/').glob('*.mcds'))

# Feature list after basic filter
feature_path = 'FeatureList.BasicFilter.txt'

# Dimension name used to do clustering
obs_dim = 'cell'  # observation
var_dim = 'chrom100k'  # feature

# HVF method:
# SVR: regression based
# Bins: normalize dispersion per bin
hvf_method = 'SVR'
mch_pattern = 'CHN'
mcg_pattern = 'CGN'
n_top_feature = 20000

# Downsample cells
downsample = 20000

Load Data

Metadata

metadata = pd.read_csv(metadata_path, index_col=0)
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')
Metadata of 16985 cells
metadata.head()
AllcPath mCCCFrac mCGFrac mCGFracAdj mCHFrac mCHFracAdj FinalReads InputReads MappedReads DissectionRegion BamFilteringRate MappingRate Plate Col384 Row384 FANSDate Slice Sample
10E_M_0 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.008198 0.822633 0.821166 0.041640 0.033718 1626504.0 4407752 2892347.0 10E 0.562347 0.656195 CEMBA190625-10E-1 0 0 190625 10 10E_190625
10E_M_1 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006019 0.743035 0.741479 0.024127 0.018218 2009998.0 5524084 3657352.0 10E 0.549577 0.662074 CEMBA190625-10E-1 0 1 190625 10 10E_190625
10E_M_10 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006569 0.750172 0.748520 0.027665 0.021235 1383636.0 3455260 2172987.0 10E 0.636744 0.628892 CEMBA190625-10E-1 19 0 190625 10 10E_190625
10E_M_101 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.006353 0.760898 0.759369 0.026547 0.020323 2474670.0 7245482 4778768.0 10E 0.517847 0.659551 CEMBA190625-10E-1 18 3 190625 10 10E_190625
10E_M_102 /gale/raidix/rdx-4/mapping/10E/CEMBA190625-10E... 0.005409 0.752980 0.751637 0.019497 0.014164 2430290.0 7004754 4609570.0 10E 0.527227 0.658063 CEMBA190625-10E-1 19 2 190625 10 10E_190625
use_features = pd.read_csv(feature_path, header=None, index_col=0).index
use_features.name = var_dim

MCDS

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    # still use all the cells to load MCDS
    total_mcds = MCDS.open(mcds_path_list,
                           obs_dim=obs_dim,
                           use_obs=metadata.index).sel({var_dim: use_features})

Add mC Rate

total_mcds.add_mc_rate(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)

total_mcds
<xarray.MCDS>
Dimensions:              (cell: 16985, chrom100k: 24045, count_type: 2, geneslop2k: 55487, mc_type: 2)
Coordinates:
  * mc_type              (mc_type) object 'CGN' 'CHN'
  * cell                 (cell) object '10E_M_0' '10E_M_1' ... '9J_M_3036'
  * count_type           (count_type) object 'mc' 'cov'
    strand_type          <U4 'both'
  * chrom100k            (chrom100k) int64 30 31 32 33 ... 26335 26336 26337
    chrom100k_chrom      (chrom100k) object dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_bin_start  (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
    chrom100k_bin_end    (chrom100k) int64 dask.array<chunksize=(24045,), meta=np.ndarray>
  * geneslop2k           (geneslop2k) object 'ENSMUSG00000102693.1' ... 'ENSM...
    geneslop2k_chrom     (geneslop2k) object dask.array<chunksize=(55487,), meta=np.ndarray>
    geneslop2k_start     (geneslop2k) int64 dask.array<chunksize=(55487,), meta=np.ndarray>
    geneslop2k_end       (geneslop2k) int64 dask.array<chunksize=(55487,), meta=np.ndarray>
Data variables:
    chrom100k_da         (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(473, 24045, 2, 2), meta=np.ndarray>
    geneslop2k_da        (cell, geneslop2k, mc_type, count_type) uint32 dask.array<chunksize=(473, 55487, 2, 2), meta=np.ndarray>
    chrom100k_da_frac    (cell, chrom100k, mc_type) float64 dask.array<chunksize=(473, 24045, 2), meta=np.ndarray>

If downsample

if downsample and total_cells > downsample:
    # make a downsampled mcds
    print(f'Downsample cells to {downsample} to calculate HVF.')
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    mcds = total_mcds.sel(
        {obs_dim: total_mcds.get_index(obs_dim).isin(downsample_cell_ids)})
else:
    mcds = total_mcds
if load and (mcds.get_index('cell').size <= 20000):
    # load the relevant data so the computation can be fater, watch out memory!
    mcds[f'{var_dim}_da_frac'].load()
/home/hanliu/miniconda3/envs/allcools/lib/python3.8/site-packages/dask/core.py:121: RuntimeWarning: invalid value encountered in true_divide
  return func(*(_execute_task(a, cache) for a in args))

The RuntimeWarning is expected (due to cov == 0). You can ignore it.

Highly Variable Feature

mCH

if hvf_method == 'SVR':
    # use SVR based method
    mch_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mch_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mch_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mch_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.05,
                                 cov_binsize=100)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24045
Highly Variable Feature:  20000 (83.2%)

Save AnnData

total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mch_pattern}_feature_select']
mch_adata = total_mcds.get_adata(mc_type=mch_pattern,
                           var_dim=var_dim,
                           select_hvf=True)

mch_adata.write_h5ad(f'mCH.HVF.h5ad')

mch_adata
... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
    var: 'chrom', 'bin_start', 'bin_end', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select'

mCG

if hvf_method == 'SVR':
    # use SVR based method
    mcg_hvf = mcds.calculate_hvf_svr(var_dim=var_dim,
                                     mc_type=mcg_pattern,
                                     n_top_feature=n_top_feature,
                                     plot=True)
else:
    # use bin based method
    mcg_hvf = mcds.calculate_hvf(var_dim=var_dim,
                                 mc_type=mcg_pattern,
                                 min_mean=0,
                                 max_mean=5,
                                 n_top_feature=n_top_feature,
                                 bin_min_features=5,
                                 mean_binsize=0.02,
                                 cov_binsize=20)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24045
Highly Variable Feature:  20000 (83.2%)

Save AnnData

total_mcds.coords[f'{var_dim}_{mch_pattern}_feature_select'] = mcds.coords[
    f'{var_dim}_{mch_pattern}_feature_select']
mcg_adata = total_mcds.get_adata(mc_type=mcg_pattern,
                                 var_dim=var_dim,
                                 select_hvf=True)

mcg_adata.write_h5ad(f'mCG.HVF.h5ad')

mcg_adata
... storing 'chrom' as categorical
AnnData object with n_obs × n_vars = 16985 × 20000
    var: 'chrom', 'bin_start', 'bin_end', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'